This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

library(xgboost)
library(Matrix)
library(mclust)
ds0 <- readRDS("./ds0.rds")
ds1 <- readRDS("./ds1.rds")
ds2 <- readRDS("./ds2.rds")

分发训练集

Idents(ds2) <- ds2$conditions
ds2_AC <- subset(ds2, idents = "AC")
ds2_PA <- subset(ds2, idents = "PA")
ds2_AC <- ds2_AC %>% FindNeighbors(dims = 1:20) %>% FindClusters(resolution = 0.1)
ds2_PA <- ds2_PA %>% FindNeighbors(dims = 1:20) %>% FindClusters(resolution = 0.1)
umapplot(ds2_AC) + scale_y_continuous(limits = c(-5,15),breaks = NULL) +
        scale_x_continuous(limits = c(-5,15),breaks = NULL)
umapplot(ds2_PA)+ scale_y_continuous(limits = c(-5,15),breaks = NULL) +
        scale_x_continuous(limits = c(-5,15),breaks = NULL)

AC_markers <- FindAllMarkers(ds2_AC,logfc.threshold = 0.7, min.diff.pct = 0.2)
PA_markers <- FindAllMarkers(ds2_PA,logfc.threshold = 0.7, min.diff.pct = 0.2)

在AC上训练

AC_data <- get_data_table(ds2_AC, highvar = F, type = "data")
AC_label <- as.numeric(as.character(Idents(ds2_AC)))

set.seed(7)
index <- c(1:dim(AC_data)[2]) %>% sample(ceiling(0.3*dim(AC_data)[2]), replace = F, prob = NULL)

colnames(AC_data) <- NULL

AC_train_data <- list(data = t(as(AC_data[,-index],"dgCMatrix")), label = AC_label[-index])
AC_test_data <- list(data = t(as(AC_data[,index],"dgCMatrix")), label = AC_label[index])

# data(agaricus.train, package='xgboost')

AC_train <- xgb.DMatrix(data = AC_train_data$data,label = AC_train_data$label)
AC_test <- xgb.DMatrix(data = AC_test_data$data,label = AC_test_data$label)

# xgb_params_train = {
#     'objective':'multi:softprob',
#     'eval_metric':'mlogloss',
#     'num_class':self.numbertrainclasses,
#     'eta':0.2,
#     'max_depth':6,
#     'subsample': 0.6}
# nround = 200

watchlist <- list(train = AC_train, eval = AC_test)
xgb_param <- list(eta = 0.2, max_depth = 6, 
                  subsample = 0.6,  num_class = length(table(Idents(ds2_AC))),
                  objective = "multi:softmax", eval_metric = 'mlogloss')

bst_model <- xgb.train(xgb_param, AC_train, nrounds = 200, watchlist, verbose = 0)

# 特征提取
importance <- xgb.importance(colnames(AC_train), model = bst_model)
head(importance)
xgb.ggplot.importance(head(importance,20),n_clusters = 1) #这个cluster和分类没有关系


multi_featureplot(head(importance,9)$Feature,ds2)

AC_genes <- head(importance, 500) ##选择top500


#混淆矩阵
predict_AC_test <- round(predict(bst_model, newdata = AC_test))

AC_confuse_matrix_test <- table(AC_test_data$label, predict_AC_test, dnn=c("true","pre"))
AC_confuse_matrix_test_prop <- prop.table(AC_confuse_matrix_test, 1)
AC_confuse_matrix_test_prop
    pre
true           0           1           2           3
   0 0.994652406 0.002673797 0.002673797 0.000000000
   1 0.000000000 0.987755102 0.012244898 0.000000000
   2 0.000000000 0.026455026 0.962962963 0.010582011
   3 0.000000000 0.009009009 0.054054054 0.936936937
#ROC曲线

# xgboost_roc <- pROC::multiclass.roc(AC_test_data$label, predict_AC_test) #多分类ROC
xgboost_roc <- pROC::roc(AC_test_data$label, predict_AC_test)
Warning in roc.default(AC_test_data$label, predict_AC_test) :
  'response' has more than two levels. Consider setting 'levels' explicitly or using 'multiclass.roc' instead
Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(xgboost_roc, print.auc=TRUE, auc.polygon=TRUE, 
  grid=c(0.1, 0.2),grid.col=c("green", "red"), 
  max.auc.polygon=TRUE,auc.polygon.col="skyblue", 
  print.thres=TRUE,main='ROC curve') #前两个分量

在PA上训练

PA_data <- get_data_table(ds2_PA, highvar = F, type = "data")
PA_label <- as.numeric(as.character(Idents(ds2_PA)))

set.seed(7)
index <- c(1:dim(PA_data)[2]) %>% sample(ceiling(0.3*dim(PA_data)[2]), replace = F, prob = NULL)

colnames(PA_data) <- NULL

PA_train_data <- list(data = t(as(PA_data[,-index],"dgCMatrix")), label = PA_label[-index])
PA_test_data <- list(data = t(as(PA_data[,index],"dgCMatrix")), label = PA_label[index])

# data(agaricus.train, pPAkage='xgboost')

PA_train <- xgb.DMatrix(data = PA_train_data$data,label = PA_train_data$label)
PA_test <- xgb.DMatrix(data = PA_test_data$data,label = PA_test_data$label)

# xgb_params_train = {
#     'objective':'multi:softprob',
#     'eval_metric':'mlogloss',
#     'num_class':self.numbertrainclasses,
#     'eta':0.2,
#     'max_depth':6,
#     'subsample': 0.6}
# nround = 200

watchlist <- list(train = PA_train, eval = PA_test)
xgb_param <- list(eta = 0.2, max_depth = 6, 
                  subsample = 0.6,  num_class = length(table(Idents(ds2_PA))),
                  objective = "multi:softmax", eval_metric = 'mlogloss')

bst_model <- xgb.train(xgb_param, PA_train, nrounds = 200, watchlist, verbose = 0)

选择特征common genes of top 500

使用所有来自PA的细胞训练分类器

应用在AC上,计算ARI

selected_features <- intersect(PA_genes$Feature, AC_genes$Feature)

PA_data <- get_data_table(ds2_PA, highvar = F, type = "data")
PA_data <- PA_data[selected_features,]
PA_label <- as.numeric(as.character(Idents(ds2_PA)))
colnames(PA_data) <- NULL

PA_train_data <- list(data = t(as(PA_data,"dgCMatrix")), label = PA_label)

PA_train <- xgb.DMatrix(data = PA_train_data$data,label = PA_train_data$label)

xgb_param <- list(eta = 0.2, max_depth = 6, 
                  subsample = 0.6,  num_class = length(table(Idents(ds2_PA))),
                  objective = "multi:softmax", eval_metric = 'mlogloss')

bst_model <- xgb.train(xgb_param, PA_train, nrounds = 200, verbose = 1)

# 特征提取
importance <- xgb.importance(colnames(PA_train), model = bst_model)
head(importance)
xgb.ggplot.importance(head(importance,20),n_clusters = 1)


multi_featureplot(head(importance,9)$Feature, ds2)

应用到AC上

AC_data <- get_data_table(ds2_AC, highvar = F, type = "data")
AC_data <- AC_data[selected_features,]
AC_label <- as.numeric(as.character(Idents(ds2_AC)))
colnames(AC_data) <- NULL

AC_test_data <- list(data = t(as(AC_data,"dgCMatrix")), label = AC_label)

AC_test <- xgb.DMatrix(data = AC_test_data$data,label = AC_test_data$label)

#计算混淆矩阵
predict_AC_test <- round(predict(bst_model, newdata = AC_test))

AC_confuse_matrix_test <- table(AC_test_data$label, predict_AC_test, dnn=c("true","pre"))
AC_confuse_matrix_test_prop <- prop.table(AC_confuse_matrix_test,1)
AC_confuse_matrix_test_prop  #分析发育轨迹
    pre
true           0           1           2
   0 0.992635025 0.004091653 0.003273322
   1 0.824879227 0.172705314 0.002415459
   2 0.449922958 0.497688752 0.052388290
   3 0.002762431 0.066298343 0.930939227
#ROC曲线
# xgboost_roc <- pROC::multiclass.roc(AC_test_data$label, predict_AC_test) #多分类ROC
xgboost_roc <- pROC::roc(AC_test_data$label, predict_AC_test)
Warning in roc.default(AC_test_data$label, predict_AC_test) :
  'response' has more than two levels. Consider setting 'levels' explicitly or using 'multiclass.roc' instead
Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(xgboost_roc, print.auc=TRUE, auc.polygon=TRUE, 
  grid=c(0.1, 0.2),grid.col=c("green", "red"), 
  max.auc.polygon=TRUE,auc.polygon.col="skyblue", 
  print.thres=TRUE,main='ROC curve') #前两个分量ROC


multi_featureplot(head(importance,4)$Feature, ds2, split.by = "conditions")

multi_featureplot(head(importance,9)$Feature, ds2_AC)


# 计算ARI 
 
adjustedRandIndex(predict_AC_test, AC_test_data$label)
[1] 0.307929

选择特征common genes of top 500

使用所有来自AC的细胞训练分类器

AC_data <- get_data_table(ds2_AC, highvar = F, type = "data")
AC_data <- AC_data[selected_features,]
AC_label <- as.numeric(as.character(Idents(ds2_AC)))
colnames(AC_data) <- NULL

AC_train_data <- list(data = t(as(AC_data,"dgCMatrix")), label = AC_label)

AC_train <- xgb.DMatrix(data = AC_train_data$data,label = AC_train_data$label)

xgb_ACram <- list(eta = 0.2, max_depth = 6, 
                  subsample = 0.6,  num_class = length(table(Idents(ds2_AC))),
                  objective = "multi:softmax", eval_metric = 'mlogloss')

bst_model2 <- xgb.train(xgb_ACram, AC_train, nrounds = 200, verbose = 1)

# 特征提取
importance2 <- xgb.importance(colnames(AC_train), model = bst_model2)
head(importance2)
xgb.ggplot.importance(head(importance2,20),n_clusters = 1)

multi_featureplot(head(importance2,9)$Feature, ds2)

应用在PA上,计算ARI

PA_data <- get_data_table(ds2_PA, highvar = F, type = "data")
PA_data <- PA_data[selected_features,]
PA_label <- as.numeric(as.character(Idents(ds2_PA)))
colnames(PA_data) <- NULL

PA_test_data <- list(data = t(as(PA_data,"dgCMatrix")), label = PA_label)

PA_test <- xgb.DMatrix(data = PA_test_data$data,label = PA_test_data$label)

#计算混淆矩阵
predict_PA_test <- round(predict(bst_model2, newdata = PA_test))

PA_confuse_matrix_test <- table(PA_test_data$label, predict_PA_test, dnn=c("true","pre"))
PA_confuse_matrix_test_prop <- prop.table(PA_confuse_matrix_test,1)
PA_confuse_matrix_test_prop  #分析发育轨迹

#ROC曲线
# xgboost_roc <- pROC::multiclass.roc(PA_test_data$label, predict_PA_test) #多分类ROC
xgboost_roc <- pROC::roc(PA_test_data$label, predict_PA_test)
plot(xgboost_roc, print.auc=TRUE, auc.polygon=TRUE, 
  grid=c(0.1, 0.2),grid.col=c("green", "red"), 
  max.auc.polygon=TRUE,auc.polygon.col="skyblue", 
  print.thres=TRUE,main='ROC curve') #前两个分量ROC

# 计算ARI 

adjustedRandIndex(predict_PA_test, PA_test_data$label)
umapplot(ds2,split.by = "conditions")
table(ds2$conditions)

sankey plot

PA -> AC ARI 0.3024837

pre

true 0 1 2 0 0.980360065 0.003273322 0.016366612 1 0.799516908 0.196859903 0.003623188 2 0.453004622 0.493066256 0.053929122 3 0.002762431 0.052486188 0.944751381 ## AC ->PA ARI 0.1797689 pre true 0 1 2 3 0 0.027107438 0.287272727 0.682644628 0.002975207 1 0.000349895 0.075227432 0.914975507 0.009447166 2 0.008130081 0.003252033 0.175609756 0.813008130

library(plotly)

plot_ly(type = "sankey", orientation = "h",
    node = list(
      label = c("PA_0", "PA_1", "PA_2", "AC_0","AC_1","AC_2","AC_3"), 
      color = colors_list, pad = 15, thickness = 30,
      line = list(
        color = "black",
        width = 1)),
    link = list(
      source = c(0,0,0,0,1,1,1,1,2,2,2,2),
      target = c(3,4,5,6,3,4,5,6,3,4,5,6),
      value =  as.numeric(AC_confuse_matrix_test),
      color = c("#66C2A5","#66C2A5","#66C2A5","#66C2A5","#FC8D62","#FC8D62","#FC8D62",
                "#FC8D62","#8DA0CB","#8DA0CB","#8DA0CB","#8DA0CB")
      ))

AC_confuse_matrix_test
    pre
true    0    1    2
   0 1213    5    4
   1  683  143    2
   2  292  323   34
   3    1   24  337
t(PA_confuse_matrix_test)
   true
pre    0    1    2
  0   73    3    4
  1  853  198    3
  2 2094 2634  108
  3    5   23  500
# fig <- fig %>% layout(
#     title = "Basic Sankey Diagram",
#     font = list(
#       size = 10 ))
# 

umapplot(ds2_AC)

umapplot(ds2_PA)

umapplot(ds2,split.by = "conditions")

varify 部分

数据集CA_dataset1

在AC上训练 使用top500 in AC

temp <- get_data_table(ds2_AC, highvar = F, type = "data")
AC_data <- matrix(data=0, nrow = length(AC_genes$Feature), ncol = length(colnames(temp)), byrow = FALSE, dimnames = list(AC_genes$Feature,colnames(temp)))

for(i in intersect(AC_genes$Feature, rownames(temp))){
  AC_data[i,] <- temp[i,]
}
rm(temp)

AC_label <- as.numeric(as.character(Idents(ds2_AC)))

set.seed(7)
index <- c(1:dim(AC_data)[2]) %>% sample(ceiling(0.3*dim(AC_data)[2]), replace = F, prob = NULL)

colnames(AC_data) <- NULL

AC_train_data <- list(data = t(as(AC_data[,-index],"dgCMatrix")), label = AC_label[-index])
AC_test_data <- list(data = t(as(AC_data[,index],"dgCMatrix")), label = AC_label[index])

AC_train <- xgb.DMatrix(data = AC_train_data$data,label = AC_train_data$label)
AC_test <- xgb.DMatrix(data = AC_test_data$data,label = AC_test_data$label)


watchlist <- list(train = AC_train, eval = AC_test)
xgb_param <- list(eta = 0.2, max_depth = 6, 
                  subsample = 0.6,  num_class = length(table(Idents(ds2_AC))),
                  objective = "multi:softmax", eval_metric = 'mlogloss')

bst_model <- xgb.train(xgb_param, AC_train, nrounds = 200, watchlist, verbose = 0)

# 特征提取
importance <- xgb.importance(colnames(AC_train), model = bst_model)
head(importance)
xgb.ggplot.importance(head(importance,20),n_clusters = 1) #这个cluster和分类没有关系

multi_featureplot(head(importance,9)$Feature,ds2)


#混淆矩阵
predict_AC_test <- round(predict(bst_model, newdata = AC_test))

AC_confuse_matrix_test <- table(AC_test_data$label, predict_AC_test, dnn=c("true","pre"))
AC_confuse_matrix_test_prop <- prop.table(AC_confuse_matrix_test, 1)
AC_confuse_matrix_test_prop
#ROC曲线

# xgboost_roc <- pROC::multiclass.roc(AC_test_data$label, predict_AC_test) #多分类ROC
xgboost_roc <- pROC::roc(AC_test_data$label, predict_AC_test)
plot(xgboost_roc, print.auc=TRUE, auc.polygon=TRUE, 
  grid=c(0.1, 0.2),grid.col=c("green", "red"), 
  max.auc.polygon=TRUE,auc.polygon.col="skyblue", 
  print.thres=TRUE,main='ROC curve') #前两个分量
ds1 <- ds1 %>% FindNeighbors(dims = 1:20) %>% FindClusters(resolution = 0.3)
umapplot(ds1)
ds1 <- RenameIdents(ds1,'0' = '0','1' ='0','2'='1','3'='2','4' = '3','5' = '1')
f("FBLN1",ds1)
ds1_markers <- FindAllMarkers(ds1,logfc.threshold = 0.5, min.diff.pct = 0.2)
temp <- get_data_table(ds1, highvar = F, type = "data")
ds1_data <- matrix(data=0, nrow = length(AC_genes$Feature), ncol = length(colnames(temp)), byrow = FALSE, dimnames = list(AC_genes$Feature,colnames(temp)))

for(i in intersect(AC_genes$Feature, rownames(temp))){
  ds1_data[i,] <- temp[i,]
}
rm(temp)

ds1_label <- as.numeric(as.character(Idents(ds1)))
colnames(ds1_data) <- NULL

ds1_test_data <- list(data = t(as(ds1_data,"dgCMatrix")), label = ds1_label)

ds1_test <- xgb.DMatrix(data = ds1_test_data$data,label = ds1_test_data$label)

#计算混淆矩阵
predict_ds1_test <- round(predict(bst_model, newdata = ds1_test))

ds1_data_confuse_matrix_test <- table(ds1_test_data$label, predict_ds1_test, dnn=c("true","pre"))
ds1_data_confuse_matrix_test_prop <- prop.table(ds1_data_confuse_matrix_test,1)
ds1_data_confuse_matrix_test
ds1_data_confuse_matrix_test_prop  #分析发育轨迹



#ROC曲线
# xgboost_roc <- pROC::multiclass.roc(ds1_test_data$label, predict_ds1_test) #多分类ROC
xgboost_roc <- pROC::roc(ds1_test_data$label, predict_ds1_test)
plot(xgboost_roc, print.auc=TRUE, auc.polygon=TRUE, 
  grid=c(0.1, 0.2),grid.col=c("green", "red"), 
  max.auc.polygon=TRUE,auc.polygon.col="skyblue", 
  print.thres=TRUE,main='ROC curve')  #前两个分量ROC

# 计算ARI 

adjustedRandIndex(predict_ds1_test, ds1_test_data$label)

冠状动脉数据集

ds0 <- ds0 %>% FindNeighbors(dims = 1:20) %>% FindClusters(resolution = 0.1)
umapplot(ds0)
f("MYH11",ds0)
ds0_markers <- FindAllMarkers(ds0,logfc.threshold = 0.7, min.diff.pct = 0.2)
selected_features <- AC_genes$Feature
temp <- get_data_table(ds0, highvar = F, type = "data")
ds0_data <- matrix(data=0, nrow = length(selected_features), ncol = length(colnames(temp)), byrow = FALSE, dimnames = list(selected_features,colnames(temp)))


for(i in intersect(selected_features,rownames(temp))){
  ds0_data[i,] <- temp[i,]
}
# rm(temp)

ds0_label <- as.numeric(as.character(Idents(ds0)))
colnames(ds0_data) <- NULL

ds0_test_data <- list(data = t(as(ds0_data,"dgCMatrix")), label = ds0_label)

ds0_test <- xgb.DMatrix(data = ds0_test_data$data,label = ds0_test_data$label)

#计算混淆矩阵
predict_ds0_test <- round(predict(bst_model, newdata = ds0_test))

ds0_data_confuse_matrix_test <- table(ds0_test_data$label, predict_ds0_test, dnn=c("true","pre"))
ds0_data_confuse_matrix_test_prop <- prop.table(ds0_data_confuse_matrix_test,1)
ds0_data_confuse_matrix_test
ds0_data_confuse_matrix_test_prop  #分析发育轨迹



#ROC曲线
# xgboost_roc <- pROC::multiclass.roc(ds0_test_data$label, predict_ds0_test) #多分类ROC
xgboost_roc <- pROC::roc(ds0_test_data$label, predict_ds0_test)
plot(xgboost_roc, print.auc=TRUE, auc.polygon=TRUE, 
  grid=c(0.1, 0.2),grid.col=c("green", "red"), 
  max.auc.polygon=TRUE,auc.polygon.col="skyblue", 
  print.thres=TRUE,main='ROC curve') #前两个分量ROC

# 计算ARI 

adjustedRandIndex(predict_ds0_test, ds0_test_data$label)
# load("./init.RData")
multi_featureplot(head(importance2,9)$Feature, ds2_AC)
multi_featureplot(head(importance2,9)$Feature, ds0)
multi_featureplot(head(importance2,9)$Feature, ds1)
f("MYH11", ds2_AC)
umapplot(ds0)

淋巴细胞

Idents(lym_ds2) <- lym_ds2$conditions
lym_ds2_AC <- subset(lym_ds2, idents = "AC")
lym_ds2_PA <- subset(lym_ds2, idents = "PA")
lym_ds2_AC <- lym_ds2_AC %>% FindNeighbors(dims = 1:20) %>% FindClusters(resolution = 0.2)
umapplot(lym_ds2_AC)
lym_ds2_PA <- lym_ds2_PA %>% FindNeighbors(dims = 1:20) %>% FindClusters(resolution = 0.2)
umapplot(lym_ds2_PA)

用PA的lym训练

lym_PA_data <- get_data_table(lym_ds2_PA, highvar = F, type = "data")
lym_PA_label <- as.numeric(as.character(Idents(lym_ds2_PA)))

set.seed(7)
index <- c(1:dim(lym_PA_data)[2]) %>% sample(ceiling(0.3*dim(lym_PA_data)[2]), replace = F, prob = NULL)

colnames(lym_PA_data) <- NULL

lym_PA_train_data <- list(data = t(as(lym_PA_data[,-index],"dgCMatrix")), label = lym_PA_label[-index])
lym_PA_test_data <- list(data = t(as(lym_PA_data[,index],"dgCMatrix")), label = lym_PA_label[index])

# data(agaricus.train, plym_PAkage='xgboost')

lym_PA_train <- xgb.DMatrix(data = lym_PA_train_data$data,label = lym_PA_train_data$label)
lym_PA_test <- xgb.DMatrix(data = lym_PA_test_data$data,label = lym_PA_test_data$label)

watchlist <- list(train = lym_PA_train, eval = lym_PA_test)
xgb_param <- list(eta = 0.2, max_depth = 6, 
                  subsample = 0.6,  num_class = length(table(Idents(lym_ds2_PA))),
                  objective = "multi:softmax", eval_metric = 'mlogloss')

bst_model <- xgb.train(xgb_param, lym_PA_train, nrounds = 200, watchlist, verbose = 1)
[1] train-mlogloss:1.364683 eval-mlogloss:1.386518 
[2] train-mlogloss:1.104948 eval-mlogloss:1.141078 
[3] train-mlogloss:0.921658 eval-mlogloss:0.970349 
[4] train-mlogloss:0.782067 eval-mlogloss:0.838498 
[5] train-mlogloss:0.672917 eval-mlogloss:0.737674 
[6] train-mlogloss:0.584400 eval-mlogloss:0.656063 
[7] train-mlogloss:0.511279 eval-mlogloss:0.589930 
[8] train-mlogloss:0.450917 eval-mlogloss:0.535140 
[9] train-mlogloss:0.399006 eval-mlogloss:0.489075 
[10]    train-mlogloss:0.356262 eval-mlogloss:0.450757 
[11]    train-mlogloss:0.321021 eval-mlogloss:0.418092 
[12]    train-mlogloss:0.289741 eval-mlogloss:0.391501 
[13]    train-mlogloss:0.263458 eval-mlogloss:0.368993 
[14]    train-mlogloss:0.239869 eval-mlogloss:0.348346 
[15]    train-mlogloss:0.219948 eval-mlogloss:0.330793 
[16]    train-mlogloss:0.201978 eval-mlogloss:0.316338 
[17]    train-mlogloss:0.186481 eval-mlogloss:0.302551 
[18]    train-mlogloss:0.173125 eval-mlogloss:0.292528 
[19]    train-mlogloss:0.160651 eval-mlogloss:0.283499 
[20]    train-mlogloss:0.149984 eval-mlogloss:0.274891 
[21]    train-mlogloss:0.141659 eval-mlogloss:0.268080 
[22]    train-mlogloss:0.133263 eval-mlogloss:0.261263 
[23]    train-mlogloss:0.125407 eval-mlogloss:0.255525 
[24]    train-mlogloss:0.117925 eval-mlogloss:0.250536 
[25]    train-mlogloss:0.111371 eval-mlogloss:0.246171 
[26]    train-mlogloss:0.105522 eval-mlogloss:0.242234 
[27]    train-mlogloss:0.100195 eval-mlogloss:0.238797 
[28]    train-mlogloss:0.095323 eval-mlogloss:0.235266 
[29]    train-mlogloss:0.090141 eval-mlogloss:0.232179 
[30]    train-mlogloss:0.086082 eval-mlogloss:0.229155 
[31]    train-mlogloss:0.081895 eval-mlogloss:0.225177 
[32]    train-mlogloss:0.078219 eval-mlogloss:0.223170 
[33]    train-mlogloss:0.074680 eval-mlogloss:0.221246 
[34]    train-mlogloss:0.070858 eval-mlogloss:0.219093 
[35]    train-mlogloss:0.067738 eval-mlogloss:0.217748 
[36]    train-mlogloss:0.065098 eval-mlogloss:0.216182 
[37]    train-mlogloss:0.062028 eval-mlogloss:0.214797 
[38]    train-mlogloss:0.059496 eval-mlogloss:0.213793 
[39]    train-mlogloss:0.056910 eval-mlogloss:0.212119 
[40]    train-mlogloss:0.054699 eval-mlogloss:0.210574 
[41]    train-mlogloss:0.052492 eval-mlogloss:0.209919 
[42]    train-mlogloss:0.050442 eval-mlogloss:0.209348 
[43]    train-mlogloss:0.048298 eval-mlogloss:0.208500 
[44]    train-mlogloss:0.046239 eval-mlogloss:0.207525 
[45]    train-mlogloss:0.043837 eval-mlogloss:0.206795 
[46]    train-mlogloss:0.041920 eval-mlogloss:0.206059 
[47]    train-mlogloss:0.040478 eval-mlogloss:0.205698 
[48]    train-mlogloss:0.038953 eval-mlogloss:0.204660 
[49]    train-mlogloss:0.037501 eval-mlogloss:0.204111 
[50]    train-mlogloss:0.035999 eval-mlogloss:0.202725 
[51]    train-mlogloss:0.034733 eval-mlogloss:0.202490 
[52]    train-mlogloss:0.033403 eval-mlogloss:0.202015 
[53]    train-mlogloss:0.032222 eval-mlogloss:0.201200 
[54]    train-mlogloss:0.031079 eval-mlogloss:0.200641 
[55]    train-mlogloss:0.029928 eval-mlogloss:0.200692 
[56]    train-mlogloss:0.028919 eval-mlogloss:0.200172 
[57]    train-mlogloss:0.027901 eval-mlogloss:0.199742 
[58]    train-mlogloss:0.026743 eval-mlogloss:0.199245 
[59]    train-mlogloss:0.025905 eval-mlogloss:0.198625 
[60]    train-mlogloss:0.025117 eval-mlogloss:0.198325 
[61]    train-mlogloss:0.024466 eval-mlogloss:0.198372 
[62]    train-mlogloss:0.023671 eval-mlogloss:0.198038 
[63]    train-mlogloss:0.022920 eval-mlogloss:0.197873 
[64]    train-mlogloss:0.022205 eval-mlogloss:0.197768 
[65]    train-mlogloss:0.021412 eval-mlogloss:0.197384 
[66]    train-mlogloss:0.020720 eval-mlogloss:0.197906 
[67]    train-mlogloss:0.020052 eval-mlogloss:0.197524 
[68]    train-mlogloss:0.019477 eval-mlogloss:0.197668 
[69]    train-mlogloss:0.018844 eval-mlogloss:0.197758 
[70]    train-mlogloss:0.018307 eval-mlogloss:0.197885 
[71]    train-mlogloss:0.017821 eval-mlogloss:0.197590 
[72]    train-mlogloss:0.017313 eval-mlogloss:0.197406 
[73]    train-mlogloss:0.016777 eval-mlogloss:0.197523 
[74]    train-mlogloss:0.016241 eval-mlogloss:0.197133 
[75]    train-mlogloss:0.015771 eval-mlogloss:0.197171 
[76]    train-mlogloss:0.015318 eval-mlogloss:0.197467 
[77]    train-mlogloss:0.014870 eval-mlogloss:0.197440 
[78]    train-mlogloss:0.014466 eval-mlogloss:0.197473 
[79]    train-mlogloss:0.014131 eval-mlogloss:0.197731 
[80]    train-mlogloss:0.013809 eval-mlogloss:0.197644 
[81]    train-mlogloss:0.013499 eval-mlogloss:0.197318 
[82]    train-mlogloss:0.013141 eval-mlogloss:0.197589 
[83]    train-mlogloss:0.012801 eval-mlogloss:0.197747 
[84]    train-mlogloss:0.012491 eval-mlogloss:0.197841 
[85]    train-mlogloss:0.012179 eval-mlogloss:0.197636 
[86]    train-mlogloss:0.011924 eval-mlogloss:0.197332 
[87]    train-mlogloss:0.011616 eval-mlogloss:0.197180 
[88]    train-mlogloss:0.011380 eval-mlogloss:0.197222 
[89]    train-mlogloss:0.011099 eval-mlogloss:0.197491 
[90]    train-mlogloss:0.010876 eval-mlogloss:0.197282 
[91]    train-mlogloss:0.010589 eval-mlogloss:0.197249 
[92]    train-mlogloss:0.010349 eval-mlogloss:0.197506 
[93]    train-mlogloss:0.010140 eval-mlogloss:0.197970 
[94]    train-mlogloss:0.009939 eval-mlogloss:0.198141 
[95]    train-mlogloss:0.009692 eval-mlogloss:0.198471 
[96]    train-mlogloss:0.009481 eval-mlogloss:0.198527 
[97]    train-mlogloss:0.009279 eval-mlogloss:0.198439 
[98]    train-mlogloss:0.009096 eval-mlogloss:0.198074 
[99]    train-mlogloss:0.008897 eval-mlogloss:0.198102 
[100]   train-mlogloss:0.008731 eval-mlogloss:0.197824 
[101]   train-mlogloss:0.008556 eval-mlogloss:0.197866 
[102]   train-mlogloss:0.008383 eval-mlogloss:0.197806 
[103]   train-mlogloss:0.008228 eval-mlogloss:0.197520 
[104]   train-mlogloss:0.008049 eval-mlogloss:0.197788 
[105]   train-mlogloss:0.007900 eval-mlogloss:0.197800 
[106]   train-mlogloss:0.007729 eval-mlogloss:0.198120 
[107]   train-mlogloss:0.007563 eval-mlogloss:0.198505 
[108]   train-mlogloss:0.007429 eval-mlogloss:0.198300 
[109]   train-mlogloss:0.007293 eval-mlogloss:0.198239 
[110]   train-mlogloss:0.007160 eval-mlogloss:0.198536 
[111]   train-mlogloss:0.007041 eval-mlogloss:0.198668 
[112]   train-mlogloss:0.006910 eval-mlogloss:0.198853 
[113]   train-mlogloss:0.006797 eval-mlogloss:0.199206 
[114]   train-mlogloss:0.006663 eval-mlogloss:0.199274 
[115]   train-mlogloss:0.006545 eval-mlogloss:0.199534 
[116]   train-mlogloss:0.006447 eval-mlogloss:0.199778 
[117]   train-mlogloss:0.006341 eval-mlogloss:0.199769 
[118]   train-mlogloss:0.006230 eval-mlogloss:0.200434 
[119]   train-mlogloss:0.006126 eval-mlogloss:0.200316 
[120]   train-mlogloss:0.006018 eval-mlogloss:0.200517 
[121]   train-mlogloss:0.005931 eval-mlogloss:0.200513 
[122]   train-mlogloss:0.005853 eval-mlogloss:0.200742 
[123]   train-mlogloss:0.005760 eval-mlogloss:0.200660 
[124]   train-mlogloss:0.005664 eval-mlogloss:0.200726 
[125]   train-mlogloss:0.005584 eval-mlogloss:0.200510 
[126]   train-mlogloss:0.005494 eval-mlogloss:0.200713 
[127]   train-mlogloss:0.005411 eval-mlogloss:0.200679 
[128]   train-mlogloss:0.005332 eval-mlogloss:0.200700 
[129]   train-mlogloss:0.005252 eval-mlogloss:0.201104 
[130]   train-mlogloss:0.005167 eval-mlogloss:0.201229 
[131]   train-mlogloss:0.005091 eval-mlogloss:0.201343 
[132]   train-mlogloss:0.005018 eval-mlogloss:0.201516 
[133]   train-mlogloss:0.004960 eval-mlogloss:0.201438 
[134]   train-mlogloss:0.004881 eval-mlogloss:0.201630 
[135]   train-mlogloss:0.004811 eval-mlogloss:0.201697 
[136]   train-mlogloss:0.004753 eval-mlogloss:0.202074 
[137]   train-mlogloss:0.004691 eval-mlogloss:0.202327 
[138]   train-mlogloss:0.004637 eval-mlogloss:0.202381 
[139]   train-mlogloss:0.004581 eval-mlogloss:0.202340 
[140]   train-mlogloss:0.004514 eval-mlogloss:0.202584 
[141]   train-mlogloss:0.004456 eval-mlogloss:0.202810 
[142]   train-mlogloss:0.004397 eval-mlogloss:0.202952 
[143]   train-mlogloss:0.004349 eval-mlogloss:0.202689 
[144]   train-mlogloss:0.004290 eval-mlogloss:0.202738 
[145]   train-mlogloss:0.004235 eval-mlogloss:0.202947 
[146]   train-mlogloss:0.004182 eval-mlogloss:0.202993 
[147]   train-mlogloss:0.004132 eval-mlogloss:0.203193 
[148]   train-mlogloss:0.004081 eval-mlogloss:0.203205 
[149]   train-mlogloss:0.004038 eval-mlogloss:0.203326 
[150]   train-mlogloss:0.003995 eval-mlogloss:0.203395 
[151]   train-mlogloss:0.003952 eval-mlogloss:0.203801 
[152]   train-mlogloss:0.003903 eval-mlogloss:0.203770 
[153]   train-mlogloss:0.003862 eval-mlogloss:0.203783 
[154]   train-mlogloss:0.003817 eval-mlogloss:0.203899 
[155]   train-mlogloss:0.003772 eval-mlogloss:0.203829 
[156]   train-mlogloss:0.003730 eval-mlogloss:0.203774 
[157]   train-mlogloss:0.003685 eval-mlogloss:0.204061 
[158]   train-mlogloss:0.003648 eval-mlogloss:0.203985 
[159]   train-mlogloss:0.003611 eval-mlogloss:0.204184 
[160]   train-mlogloss:0.003573 eval-mlogloss:0.204398 
[161]   train-mlogloss:0.003536 eval-mlogloss:0.204404 
[162]   train-mlogloss:0.003499 eval-mlogloss:0.204376 
[163]   train-mlogloss:0.003462 eval-mlogloss:0.204346 
[164]   train-mlogloss:0.003423 eval-mlogloss:0.204499 
[165]   train-mlogloss:0.003388 eval-mlogloss:0.204599 
[166]   train-mlogloss:0.003352 eval-mlogloss:0.204687 
[167]   train-mlogloss:0.003317 eval-mlogloss:0.204729 
[168]   train-mlogloss:0.003287 eval-mlogloss:0.204787 
[169]   train-mlogloss:0.003257 eval-mlogloss:0.204873 
[170]   train-mlogloss:0.003225 eval-mlogloss:0.204902 
[171]   train-mlogloss:0.003192 eval-mlogloss:0.205109 
[172]   train-mlogloss:0.003162 eval-mlogloss:0.205125 
[173]   train-mlogloss:0.003137 eval-mlogloss:0.204954 
[174]   train-mlogloss:0.003108 eval-mlogloss:0.205048 
[175]   train-mlogloss:0.003084 eval-mlogloss:0.205078 
[176]   train-mlogloss:0.003057 eval-mlogloss:0.205081 
[177]   train-mlogloss:0.003031 eval-mlogloss:0.205129 
[178]   train-mlogloss:0.003005 eval-mlogloss:0.205052 
[179]   train-mlogloss:0.002984 eval-mlogloss:0.205136 
[180]   train-mlogloss:0.002961 eval-mlogloss:0.205221 
[181]   train-mlogloss:0.002938 eval-mlogloss:0.205245 
[182]   train-mlogloss:0.002912 eval-mlogloss:0.205307 
[183]   train-mlogloss:0.002887 eval-mlogloss:0.205428 
[184]   train-mlogloss:0.002868 eval-mlogloss:0.205573 
[185]   train-mlogloss:0.002845 eval-mlogloss:0.205670 
[186]   train-mlogloss:0.002821 eval-mlogloss:0.205713 
[187]   train-mlogloss:0.002799 eval-mlogloss:0.205551 
[188]   train-mlogloss:0.002778 eval-mlogloss:0.205443 
[189]   train-mlogloss:0.002753 eval-mlogloss:0.205433 
[190]   train-mlogloss:0.002733 eval-mlogloss:0.205543 
[191]   train-mlogloss:0.002711 eval-mlogloss:0.205773 
[192]   train-mlogloss:0.002693 eval-mlogloss:0.205855 
[193]   train-mlogloss:0.002672 eval-mlogloss:0.206061 
[194]   train-mlogloss:0.002650 eval-mlogloss:0.206295 
[195]   train-mlogloss:0.002633 eval-mlogloss:0.206343 
[196]   train-mlogloss:0.002613 eval-mlogloss:0.206104 
[197]   train-mlogloss:0.002595 eval-mlogloss:0.206119 
[198]   train-mlogloss:0.002578 eval-mlogloss:0.206140 
[199]   train-mlogloss:0.002561 eval-mlogloss:0.206157 
[200]   train-mlogloss:0.002544 eval-mlogloss:0.206318 
# 特征提取
importance <- xgb.importance(colnames(lym_PA_train), model = bst_model)
head(importance)
xgb.ggplot.importance(head(importance,20),n_clusters = 1) #这个cluster和分类没有关系


multi_featureplot(head(importance,9)$Feature,lym_ds2_PA)

lym_PA_genes <- head(importance, 500) ##选择top500


#混淆矩阵
predict_lym_PA_test <- round(predict(bst_model, newdata = lym_PA_test))

lym_PA_confuse_matrix_test <- table(lym_PA_test_data$label, predict_lym_PA_test, dnn=c("true","pre"))
lym_PA_confuse_matrix_test_prop <- prop.table(lym_PA_confuse_matrix_test, 1)
lym_PA_confuse_matrix_test
    pre
true    0    1    2    3    4    5
   0 1504   53    0   29    0    1
   1   69 1076   16    0    8    1
   2    0   24  538    0    8    0
   3   57    1    0  329    0    0
   4    0   17    6    0  287    0
   5    4    4    4    0    0   88

用AC的lym验证

lym_AC_data <- get_data_table(lym_ds2_AC, highvar = F, type = "data")
lym_AC_label <- as.numeric(as.character(Idents(lym_ds2_AC)))

colnames(lym_AC_data) <- NULL

lym_AC_test_data <- list(data = t(as(lym_AC_data,"dgCMatrix")), label = lym_AC_label)

lym_AC_test <- xgb.DMatrix(data = lym_AC_test_data$data,label = lym_AC_test_data$label)
multi_featureplot(head(importance,9)$Feature,lym_ds2_AC)


#混淆矩阵
predict_lym_AC_test <- round(predict(bst_model, newdata = lym_AC_test))

lym_AC_confuse_matrix_test <- table(lym_AC_test_data$label, predict_lym_AC_test, dnn=c("true","pre"))
lym_AC_confuse_matrix_test_prop <- prop.table(lym_AC_confuse_matrix_test, 1)
lym_AC_confuse_matrix_test
    pre
true    0    1    2    3    4    5
   0 1010    6    0  255    0    0
   1    0   21  625    0   50    0
   2   68  433   17    2    0    2
   3    0   20    0    0  458    0
   4    0    0    0    0    0   23
adjustedRandIndex(predict_lym_AC_test, lym_AC_test_data$label)
[1] 0.7227654
labels <- lapply(levels(Idents(lym_ds2_PA)), paste0, "_PA") %>% as.character()
labels2 <- lapply(levels(Idents(lym_ds2_AC)), paste0, "_AC") %>% as.character()
sources <- rep(0:5, each = 5)  #注意这里的each和times的区别
colors <- rep(colors_list[1:6], each = 5)
targets <- rep(6:10, times = 6)

plot_ly(type = "sankey", orientation = "h",
    node = list(
      label = c(labels,labels2), 
      color = colors_list, pad = 15, thickness = 30,
      line = list(
        color = "black",
        width = 1)),
    link = list(
      source = sources,
      target = targets,
      value =  as.numeric(lym_AC_confuse_matrix_test),
      color = colors
      ))



umapplot(lym_ds2_AC)


umapplot(lym_ds2_PA)

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

---
title: "R Notebook"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Ctrl+Shift+Enter*. 

```{r}
library(xgboost)
library(Matrix)
library(mclust)
```

```{r}
ds0 <- readRDS("./ds0.rds")
ds1 <- readRDS("./ds1.rds")
ds2 <- readRDS("./ds2.rds")
```

# 分发训练集
```{r}
Idents(ds2) <- ds2$conditions
ds2_AC <- subset(ds2, idents = "AC")
ds2_PA <- subset(ds2, idents = "PA")
ds2_AC <- ds2_AC %>% FindNeighbors(dims = 1:20) %>% FindClusters(resolution = 0.1)
ds2_PA <- ds2_PA %>% FindNeighbors(dims = 1:20) %>% FindClusters(resolution = 0.1)
umapplot(ds2_AC) + scale_y_continuous(limits = c(-5,15),breaks = NULL) +
        scale_x_continuous(limits = c(-5,15),breaks = NULL)
umapplot(ds2_PA)+ scale_y_continuous(limits = c(-5,15),breaks = NULL) +
        scale_x_continuous(limits = c(-5,15),breaks = NULL)

AC_markers <- FindAllMarkers(ds2_AC,logfc.threshold = 0.7, min.diff.pct = 0.2)
PA_markers <- FindAllMarkers(ds2_PA,logfc.threshold = 0.7, min.diff.pct = 0.2)
```

## 在AC上训练
```{r}
AC_data <- get_data_table(ds2_AC, highvar = F, type = "data")
AC_label <- as.numeric(as.character(Idents(ds2_AC)))

set.seed(7)
index <- c(1:dim(AC_data)[2]) %>% sample(ceiling(0.3*dim(AC_data)[2]), replace = F, prob = NULL)

colnames(AC_data) <- NULL

AC_train_data <- list(data = t(as(AC_data[,-index],"dgCMatrix")), label = AC_label[-index])
AC_test_data <- list(data = t(as(AC_data[,index],"dgCMatrix")), label = AC_label[index])

# data(agaricus.train, package='xgboost')

AC_train <- xgb.DMatrix(data = AC_train_data$data,label = AC_train_data$label)
AC_test <- xgb.DMatrix(data = AC_test_data$data,label = AC_test_data$label)

# xgb_params_train = {
#     'objective':'multi:softprob',
#     'eval_metric':'mlogloss',
#     'num_class':self.numbertrainclasses,
#     'eta':0.2,
#     'max_depth':6,
#     'subsample': 0.6}
# nround = 200

watchlist <- list(train = AC_train, eval = AC_test)
xgb_param <- list(eta = 0.2, max_depth = 6, 
                  subsample = 0.6,  num_class = length(table(Idents(ds2_AC))),
                  objective = "multi:softmax", eval_metric = 'mlogloss')

bst_model <- xgb.train(xgb_param, AC_train, nrounds = 200, watchlist, verbose = 0)

# 特征提取
importance <- xgb.importance(colnames(AC_train), model = bst_model)
head(importance)
xgb.ggplot.importance(head(importance,20),n_clusters = 1) #这个cluster和分类没有关系

multi_featureplot(head(importance,9)$Feature,ds2)
AC_genes <- head(importance, 500) ##选择top500


#混淆矩阵
predict_AC_test <- round(predict(bst_model, newdata = AC_test))

AC_confuse_matrix_test <- table(AC_test_data$label, predict_AC_test, dnn=c("true","pre"))
AC_confuse_matrix_test_prop <- prop.table(AC_confuse_matrix_test, 1)
AC_confuse_matrix_test_prop
#ROC曲线

# xgboost_roc <- pROC::multiclass.roc(AC_test_data$label, predict_AC_test) #多分类ROC
xgboost_roc <- pROC::roc(AC_test_data$label, predict_AC_test)
plot(xgboost_roc, print.auc=TRUE, auc.polygon=TRUE, 
  grid=c(0.1, 0.2),grid.col=c("green", "red"), 
  max.auc.polygon=TRUE,auc.polygon.col="skyblue", 
  print.thres=TRUE,main='ROC curve') #前两个分量

```


## 在PA上训练
```{r}
PA_data <- get_data_table(ds2_PA, highvar = F, type = "data")
PA_label <- as.numeric(as.character(Idents(ds2_PA)))

set.seed(7)
index <- c(1:dim(PA_data)[2]) %>% sample(ceiling(0.3*dim(PA_data)[2]), replace = F, prob = NULL)

colnames(PA_data) <- NULL

PA_train_data <- list(data = t(as(PA_data[,-index],"dgCMatrix")), label = PA_label[-index])
PA_test_data <- list(data = t(as(PA_data[,index],"dgCMatrix")), label = PA_label[index])

# data(agaricus.train, pPAkage='xgboost')

PA_train <- xgb.DMatrix(data = PA_train_data$data,label = PA_train_data$label)
PA_test <- xgb.DMatrix(data = PA_test_data$data,label = PA_test_data$label)

# xgb_params_train = {
#     'objective':'multi:softprob',
#     'eval_metric':'mlogloss',
#     'num_class':self.numbertrainclasses,
#     'eta':0.2,
#     'max_depth':6,
#     'subsample': 0.6}
# nround = 200

watchlist <- list(train = PA_train, eval = PA_test)
xgb_param <- list(eta = 0.2, max_depth = 6, 
                  subsample = 0.6,  num_class = length(table(Idents(ds2_PA))),
                  objective = "multi:softmax", eval_metric = 'mlogloss')

bst_model <- xgb.train(xgb_param, PA_train, nrounds = 200, watchlist, verbose = 0)

# 特征提取
importance <- xgb.importance(colnames(PA_train), model = bst_model)
head(importance)
xgb.ggplot.importance(head(importance,20),n_clusters = 1)

multi_featureplot(head(importance,9)$Feature, ds2)
PA_genes <- head(importance, 500) ##选择top500


#混淆矩阵
predict_PA_test <- round(predict(bst_model, newdata = PA_test))

PA_confuse_matrix_test <- table(PA_test_data$label, predict_PA_test, dnn=c("true","pre"))
PA_confuse_matrix_test_prop <- prop.table(PA_confuse_matrix_test,1)
PA_confuse_matrix_test_prop

adjustedRandIndex(PA_test_data$label, predict_PA_test) #PA分类器性能

#ROC曲线

# xgboost_roc <- pROC::multiclass.roc(PA_test_data$label, predict_PA_test) #多分类ROC
xgboost_roc <- pROC::roc(PA_test_data$label, predict_PA_test)
plot(xgboost_roc, print.auc=TRUE, auc.polygon=TRUE, 
  grid=c(0.1, 0.2),grid.col=c("green", "red"), 
  max.auc.polygon=TRUE,auc.polygon.col="skyblue", 
  print.thres=TRUE,main='ROC curve') #前两个分量

```
## 选择特征common genes of top 500
## 使用所有来自PA的细胞训练分类器
## 应用在AC上，计算ARI
```{r}
selected_features <- intersect(PA_genes$Feature, AC_genes$Feature)

PA_data <- get_data_table(ds2_PA, highvar = F, type = "data")
PA_data <- PA_data[selected_features,]
PA_label <- as.numeric(as.character(Idents(ds2_PA)))
colnames(PA_data) <- NULL

PA_train_data <- list(data = t(as(PA_data,"dgCMatrix")), label = PA_label)

PA_train <- xgb.DMatrix(data = PA_train_data$data,label = PA_train_data$label)

xgb_param <- list(eta = 0.2, max_depth = 6, 
                  subsample = 0.6,  num_class = length(table(Idents(ds2_PA))),
                  objective = "multi:softmax", eval_metric = 'mlogloss')

bst_model <- xgb.train(xgb_param, PA_train, nrounds = 200, verbose = 1)

# 特征提取
importance <- xgb.importance(colnames(PA_train), model = bst_model)
head(importance)
xgb.ggplot.importance(head(importance,20),n_clusters = 1)

multi_featureplot(head(importance,9)$Feature, ds2)

```
# 应用到AC上
```{r}
AC_data <- get_data_table(ds2_AC, highvar = F, type = "data")
AC_data <- AC_data[selected_features,]
AC_label <- as.numeric(as.character(Idents(ds2_AC)))
colnames(AC_data) <- NULL

AC_test_data <- list(data = t(as(AC_data,"dgCMatrix")), label = AC_label)

AC_test <- xgb.DMatrix(data = AC_test_data$data,label = AC_test_data$label)

#计算混淆矩阵
predict_AC_test <- round(predict(bst_model, newdata = AC_test))

AC_confuse_matrix_test <- table(AC_test_data$label, predict_AC_test, dnn=c("true","pre"))
AC_confuse_matrix_test_prop <- prop.table(AC_confuse_matrix_test,1)
AC_confuse_matrix_test_prop  #分析发育轨迹

#ROC曲线
# xgboost_roc <- pROC::multiclass.roc(AC_test_data$label, predict_AC_test) #多分类ROC
xgboost_roc <- pROC::roc(AC_test_data$label, predict_AC_test)
plot(xgboost_roc, print.auc=TRUE, auc.polygon=TRUE, 
  grid=c(0.1, 0.2),grid.col=c("green", "red"), 
  max.auc.polygon=TRUE,auc.polygon.col="skyblue", 
  print.thres=TRUE,main='ROC curve') #前两个分量ROC

multi_featureplot(head(importance,4)$Feature, ds2, split.by = "conditions")
multi_featureplot(head(importance,9)$Feature, ds2_AC)

# 计算ARI 
 
adjustedRandIndex(predict_AC_test, AC_test_data$label)
```

# 选择特征common genes of top 500
## 使用所有来自AC的细胞训练分类器

```{r}
AC_data <- get_data_table(ds2_AC, highvar = F, type = "data")
AC_data <- AC_data[selected_features,]
AC_label <- as.numeric(as.character(Idents(ds2_AC)))
colnames(AC_data) <- NULL

AC_train_data <- list(data = t(as(AC_data,"dgCMatrix")), label = AC_label)

AC_train <- xgb.DMatrix(data = AC_train_data$data,label = AC_train_data$label)

xgb_ACram <- list(eta = 0.2, max_depth = 6, 
                  subsample = 0.6,  num_class = length(table(Idents(ds2_AC))),
                  objective = "multi:softmax", eval_metric = 'mlogloss')

bst_model2 <- xgb.train(xgb_ACram, AC_train, nrounds = 200, verbose = 1)

# 特征提取
importance2 <- xgb.importance(colnames(AC_train), model = bst_model2)
head(importance2)
xgb.ggplot.importance(head(importance2,20),n_clusters = 1)

multi_featureplot(head(importance2,9)$Feature, ds2)
```


## 应用在PA上，计算ARI
```{r}
PA_data <- get_data_table(ds2_PA, highvar = F, type = "data")
PA_data <- PA_data[selected_features,]
PA_label <- as.numeric(as.character(Idents(ds2_PA)))
colnames(PA_data) <- NULL

PA_test_data <- list(data = t(as(PA_data,"dgCMatrix")), label = PA_label)

PA_test <- xgb.DMatrix(data = PA_test_data$data,label = PA_test_data$label)

#计算混淆矩阵
predict_PA_test <- round(predict(bst_model2, newdata = PA_test))

PA_confuse_matrix_test <- table(PA_test_data$label, predict_PA_test, dnn=c("true","pre"))
PA_confuse_matrix_test_prop <- prop.table(PA_confuse_matrix_test,1)
PA_confuse_matrix_test_prop  #分析发育轨迹

#ROC曲线
# xgboost_roc <- pROC::multiclass.roc(PA_test_data$label, predict_PA_test) #多分类ROC
xgboost_roc <- pROC::roc(PA_test_data$label, predict_PA_test)
plot(xgboost_roc, print.auc=TRUE, auc.polygon=TRUE, 
  grid=c(0.1, 0.2),grid.col=c("green", "red"), 
  max.auc.polygon=TRUE,auc.polygon.col="skyblue", 
  print.thres=TRUE,main='ROC curve') #前两个分量ROC

# 计算ARI 

adjustedRandIndex(predict_PA_test, PA_test_data$label)
```



```{r}
umapplot(ds2,split.by = "conditions")
table(ds2$conditions)
```


# sankey plot

## PA -> AC     ARI 0.3024837
    pre
true           0           1           2
   0 0.980360065 0.003273322 0.016366612
   1 0.799516908 0.196859903 0.003623188
   2 0.453004622 0.493066256 0.053929122
   3 0.002762431 0.052486188 0.944751381
## AC ->PA    ARI 0.1797689
    pre
true           0           1           2           3
   0 0.027107438 0.287272727 0.682644628 0.002975207
   1 0.000349895 0.075227432 0.914975507 0.009447166
   2 0.008130081 0.003252033 0.175609756 0.813008130
```{r}
library(plotly)

plot_ly(type = "sankey", orientation = "h",
    node = list(
      label = c("PA_0", "PA_1", "PA_2", "AC_0","AC_1","AC_2","AC_3"), 
      color = colors_list, pad = 15, thickness = 30,
      line = list(
        color = "black",
        width = 1)),
    link = list(
      source = c(0,0,0,0,1,1,1,1,2,2,2,2),
      target = c(3,4,5,6,3,4,5,6,3,4,5,6),
      value =  as.numeric(AC_confuse_matrix_test),
      color = c("#66C2A5","#66C2A5","#66C2A5","#66C2A5","#FC8D62","#FC8D62","#FC8D62",
                "#FC8D62","#8DA0CB","#8DA0CB","#8DA0CB","#8DA0CB")
      ))

AC_confuse_matrix_test
t(PA_confuse_matrix_test)
# fig <- fig %>% layout(
#     title = "Basic Sankey Diagram",
#     font = list(
#       size = 10 ))
# 

umapplot(ds2_AC)
umapplot(ds2_PA)
umapplot(ds2,split.by = "conditions")
```

# varify 部分

# 数据集CA_dataset1

## 在AC上训练  使用top500 in AC
```{r}
temp <- get_data_table(ds2_AC, highvar = F, type = "data")
AC_data <- matrix(data=0, nrow = length(AC_genes$Feature), ncol = length(colnames(temp)), byrow = FALSE, dimnames = list(AC_genes$Feature,colnames(temp)))

for(i in intersect(AC_genes$Feature, rownames(temp))){
  AC_data[i,] <- temp[i,]
}
rm(temp)

AC_label <- as.numeric(as.character(Idents(ds2_AC)))

set.seed(7)
index <- c(1:dim(AC_data)[2]) %>% sample(ceiling(0.3*dim(AC_data)[2]), replace = F, prob = NULL)

colnames(AC_data) <- NULL

AC_train_data <- list(data = t(as(AC_data[,-index],"dgCMatrix")), label = AC_label[-index])
AC_test_data <- list(data = t(as(AC_data[,index],"dgCMatrix")), label = AC_label[index])

AC_train <- xgb.DMatrix(data = AC_train_data$data,label = AC_train_data$label)
AC_test <- xgb.DMatrix(data = AC_test_data$data,label = AC_test_data$label)


watchlist <- list(train = AC_train, eval = AC_test)
xgb_param <- list(eta = 0.2, max_depth = 6, 
                  subsample = 0.6,  num_class = length(table(Idents(ds2_AC))),
                  objective = "multi:softmax", eval_metric = 'mlogloss')

bst_model <- xgb.train(xgb_param, AC_train, nrounds = 200, watchlist, verbose = 0)

# 特征提取
importance <- xgb.importance(colnames(AC_train), model = bst_model)
head(importance)
xgb.ggplot.importance(head(importance,20),n_clusters = 1) #这个cluster和分类没有关系

multi_featureplot(head(importance,9)$Feature,ds2)


#混淆矩阵
predict_AC_test <- round(predict(bst_model, newdata = AC_test))

AC_confuse_matrix_test <- table(AC_test_data$label, predict_AC_test, dnn=c("true","pre"))
AC_confuse_matrix_test_prop <- prop.table(AC_confuse_matrix_test, 1)
AC_confuse_matrix_test_prop
#ROC曲线

# xgboost_roc <- pROC::multiclass.roc(AC_test_data$label, predict_AC_test) #多分类ROC
xgboost_roc <- pROC::roc(AC_test_data$label, predict_AC_test)
plot(xgboost_roc, print.auc=TRUE, auc.polygon=TRUE, 
  grid=c(0.1, 0.2),grid.col=c("green", "red"), 
  max.auc.polygon=TRUE,auc.polygon.col="skyblue", 
  print.thres=TRUE,main='ROC curve') #前两个分量

```



```{r}
ds1 <- ds1 %>% FindNeighbors(dims = 1:20) %>% FindClusters(resolution = 0.3)
umapplot(ds1)
ds1 <- RenameIdents(ds1,'0' = '0','1' ='0','2'='1','3'='2','4' = '3','5' = '1')
f("FBLN1",ds1)
ds1_markers <- FindAllMarkers(ds1,logfc.threshold = 0.5, min.diff.pct = 0.2)
```

```{r}
temp <- get_data_table(ds1, highvar = F, type = "data")
ds1_data <- matrix(data=0, nrow = length(AC_genes$Feature), ncol = length(colnames(temp)), byrow = FALSE, dimnames = list(AC_genes$Feature,colnames(temp)))

for(i in intersect(AC_genes$Feature, rownames(temp))){
  ds1_data[i,] <- temp[i,]
}
rm(temp)

ds1_label <- as.numeric(as.character(Idents(ds1)))
colnames(ds1_data) <- NULL

ds1_test_data <- list(data = t(as(ds1_data,"dgCMatrix")), label = ds1_label)

ds1_test <- xgb.DMatrix(data = ds1_test_data$data,label = ds1_test_data$label)

#计算混淆矩阵
predict_ds1_test <- round(predict(bst_model, newdata = ds1_test))

ds1_data_confuse_matrix_test <- table(ds1_test_data$label, predict_ds1_test, dnn=c("true","pre"))
ds1_data_confuse_matrix_test_prop <- prop.table(ds1_data_confuse_matrix_test,1)
ds1_data_confuse_matrix_test
ds1_data_confuse_matrix_test_prop  #分析发育轨迹



#ROC曲线
# xgboost_roc <- pROC::multiclass.roc(ds1_test_data$label, predict_ds1_test) #多分类ROC
xgboost_roc <- pROC::roc(ds1_test_data$label, predict_ds1_test)
plot(xgboost_roc, print.auc=TRUE, auc.polygon=TRUE, 
  grid=c(0.1, 0.2),grid.col=c("green", "red"), 
  max.auc.polygon=TRUE,auc.polygon.col="skyblue", 
  print.thres=TRUE,main='ROC curve')  #前两个分量ROC

# 计算ARI 

adjustedRandIndex(predict_ds1_test, ds1_test_data$label)
```


# 冠状动脉数据集
```{r}
ds0 <- ds0 %>% FindNeighbors(dims = 1:20) %>% FindClusters(resolution = 0.1)
umapplot(ds0)
f("MYH11",ds0)
ds0_markers <- FindAllMarkers(ds0,logfc.threshold = 0.7, min.diff.pct = 0.2)
```

```{r}
selected_features <- AC_genes$Feature
temp <- get_data_table(ds0, highvar = F, type = "data")
ds0_data <- matrix(data=0, nrow = length(selected_features), ncol = length(colnames(temp)), byrow = FALSE, dimnames = list(selected_features,colnames(temp)))


for(i in intersect(selected_features,rownames(temp))){
  ds0_data[i,] <- temp[i,]
}
# rm(temp)

ds0_label <- as.numeric(as.character(Idents(ds0)))
colnames(ds0_data) <- NULL

ds0_test_data <- list(data = t(as(ds0_data,"dgCMatrix")), label = ds0_label)

ds0_test <- xgb.DMatrix(data = ds0_test_data$data,label = ds0_test_data$label)

#计算混淆矩阵
predict_ds0_test <- round(predict(bst_model, newdata = ds0_test))

ds0_data_confuse_matrix_test <- table(ds0_test_data$label, predict_ds0_test, dnn=c("true","pre"))
ds0_data_confuse_matrix_test_prop <- prop.table(ds0_data_confuse_matrix_test,1)
ds0_data_confuse_matrix_test
ds0_data_confuse_matrix_test_prop  #分析发育轨迹



#ROC曲线
# xgboost_roc <- pROC::multiclass.roc(ds0_test_data$label, predict_ds0_test) #多分类ROC
xgboost_roc <- pROC::roc(ds0_test_data$label, predict_ds0_test)
plot(xgboost_roc, print.auc=TRUE, auc.polygon=TRUE, 
  grid=c(0.1, 0.2),grid.col=c("green", "red"), 
  max.auc.polygon=TRUE,auc.polygon.col="skyblue", 
  print.thres=TRUE,main='ROC curve') #前两个分量ROC

# 计算ARI 

adjustedRandIndex(predict_ds0_test, ds0_test_data$label)
```

```{r}
# load("./init.RData")
multi_featureplot(head(importance2,9)$Feature, ds2_AC)
multi_featureplot(head(importance2,9)$Feature, ds0)
multi_featureplot(head(importance2,9)$Feature, ds1)
f("MYH11", ds2_AC)
umapplot(ds0)
```


# 淋巴细胞

```{r}
Idents(lym_ds2) <- lym_ds2$conditions
lym_ds2_AC <- subset(lym_ds2, idents = "AC")
lym_ds2_PA <- subset(lym_ds2, idents = "PA")
lym_ds2_AC <- lym_ds2_AC %>% FindNeighbors(dims = 1:20) %>% FindClusters(resolution = 0.2)
umapplot(lym_ds2_AC)
lym_ds2_PA <- lym_ds2_PA %>% FindNeighbors(dims = 1:20) %>% FindClusters(resolution = 0.2)
umapplot(lym_ds2_PA)
```

## 用PA的lym训练
```{r}
lym_PA_data <- get_data_table(lym_ds2_PA, highvar = F, type = "data")
lym_PA_label <- as.numeric(as.character(Idents(lym_ds2_PA)))

set.seed(7)
index <- c(1:dim(lym_PA_data)[2]) %>% sample(ceiling(0.3*dim(lym_PA_data)[2]), replace = F, prob = NULL)

colnames(lym_PA_data) <- NULL

lym_PA_train_data <- list(data = t(as(lym_PA_data[,-index],"dgCMatrix")), label = lym_PA_label[-index])
lym_PA_test_data <- list(data = t(as(lym_PA_data[,index],"dgCMatrix")), label = lym_PA_label[index])

# data(agaricus.train, plym_PAkage='xgboost')

lym_PA_train <- xgb.DMatrix(data = lym_PA_train_data$data,label = lym_PA_train_data$label)
lym_PA_test <- xgb.DMatrix(data = lym_PA_test_data$data,label = lym_PA_test_data$label)

watchlist <- list(train = lym_PA_train, eval = lym_PA_test)
xgb_param <- list(eta = 0.2, max_depth = 6, 
                  subsample = 0.6,  num_class = length(table(Idents(lym_ds2_PA))),
                  objective = "multi:softmax", eval_metric = 'mlogloss')

bst_model <- xgb.train(xgb_param, lym_PA_train, nrounds = 200, watchlist, verbose = 1)

# 特征提取
importance <- xgb.importance(colnames(lym_PA_train), model = bst_model)
head(importance)
xgb.ggplot.importance(head(importance,20),n_clusters = 1) #这个cluster和分类没有关系

multi_featureplot(head(importance,9)$Feature,lym_ds2_PA)
lym_PA_genes <- head(importance, 500) ##选择top500


#混淆矩阵
predict_lym_PA_test <- round(predict(bst_model, newdata = lym_PA_test))

lym_PA_confuse_matrix_test <- table(lym_PA_test_data$label, predict_lym_PA_test, dnn=c("true","pre"))
lym_PA_confuse_matrix_test_prop <- prop.table(lym_PA_confuse_matrix_test, 1)
lym_PA_confuse_matrix_test
```


## 用AC的lym验证
```{r}
lym_AC_data <- get_data_table(lym_ds2_AC, highvar = F, type = "data")
lym_AC_label <- as.numeric(as.character(Idents(lym_ds2_AC)))

colnames(lym_AC_data) <- NULL

lym_AC_test_data <- list(data = t(as(lym_AC_data,"dgCMatrix")), label = lym_AC_label)

lym_AC_test <- xgb.DMatrix(data = lym_AC_test_data$data,label = lym_AC_test_data$label)
multi_featureplot(head(importance,9)$Feature,lym_ds2_AC)

#混淆矩阵
predict_lym_AC_test <- round(predict(bst_model, newdata = lym_AC_test))

lym_AC_confuse_matrix_test <- table(lym_AC_test_data$label, predict_lym_AC_test, dnn=c("true","pre"))
lym_AC_confuse_matrix_test_prop <- prop.table(lym_AC_confuse_matrix_test, 1)
lym_AC_confuse_matrix_test
adjustedRandIndex(predict_lym_AC_test, lym_AC_test_data$label)


labels <- lapply(levels(Idents(lym_ds2_PA)), paste0, "_PA") %>% as.character()
labels2 <- lapply(levels(Idents(lym_ds2_AC)), paste0, "_AC") %>% as.character()
sources <- rep(0:5, each = 5)  #注意这里的each和times的区别
colors <- rep(colors_list[1:6], each = 5)
targets <- rep(6:10, times = 6)

plot_ly(type = "sankey", orientation = "h",
    node = list(
      label = c(labels,labels2), 
      color = colors_list, pad = 15, thickness = 30,
      line = list(
        color = "black",
        width = 1)),
    link = list(
      source = sources,
      target = targets,
      value =  as.numeric(lym_AC_confuse_matrix_test),
      color = colors
      ))



umapplot(lym_ds2_AC)

umapplot(lym_ds2_PA)
```

Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Ctrl+Alt+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Ctrl+Shift+K* to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike *Knit*, *Preview* does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
